{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading library list...\n",
      "Done\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import wrds\n",
    "from pandas.tseries.offsets import *\n",
    "###################\n",
    "# Connect to WRDS #\n",
    "###################\n",
    "# Please enter your own username and password\n",
    "conn = wrds.Connection(wrds_username='minggao97')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# CRSP monthly"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "###################\n",
    "# CRSP Block      #\n",
    "###################\n",
    "crsp_m = conn.raw_sql(\"\"\"\n",
    "                      select a.permno, a.permco, a.date, a.ret, a.retx,a.shrout, a.prc, a.cfacshr,\n",
    "                             b.shrcd, b.exchcd, b.siccd, b.ncusip,\n",
    "                             c.dlstcd, c.dlret\n",
    "                      from crsp.msf as a\n",
    "                      left join crsp.msenames as b\n",
    "                      on a.permno=b.permno and b.namedt<=a.date and a.date<=b.nameendt\n",
    "                      left join crsp.msedelist as c\n",
    "                      on a.permno=c.permno AND date_trunc('month', a.date) = date_trunc('month', c.dlstdt)\n",
    "                      where a.date between '01/01/1980' and '12/31/2023'\n",
    "                      and b.exchcd between 1 and 3\n",
    "                      and b.shrcd between 10 and 11\n",
    "                      \"\"\",\n",
    "                      date_cols=['date'])\n",
    "\n",
    "# change variable format to int\n",
    "crsp_m[['permco','permno','shrcd','exchcd']]=crsp_m[['permco','permno','shrcd','exchcd']].astype(int)\n",
    "\n",
    "# Line up date to be end of month\n",
    "crsp_m['YearMonth']=crsp_m['date']+MonthEnd(0)\n",
    "\n",
    "# Incorporate delisting return\n",
    "# when missing, set to -0.5\n",
    "crsp_m['dlret'] = np.where((~crsp_m['dlstcd'].isna()) & (crsp_m['dlret'].isna()), -0.5, crsp_m['dlret'])\n",
    "crsp_m['dlret'] = crsp_m['dlret'].fillna(0)\n",
    "crsp_m['retadj'] = (1+crsp_m['ret'])*(1+crsp_m['dlret']) - 1\n",
    "crsp_m['retadj'] = np.where((crsp_m['ret'].isna()) & (crsp_m['dlret']!=0), crsp_m['dlret'], crsp_m['ret'])\n",
    "\n",
    "crsp_m = crsp_m.sort_values(by=['permno','YearMonth']).reset_index(drop=True)\n",
    "\n",
    "crsp_m.to_parquet('../data/WRDS/crsp_m.parquet')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Compustat Annual"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "####################\n",
    "# Compustat Annual #\n",
    "####################\n",
    "comp = conn.raw_sql(\"\"\"\n",
    "                    select a.gvkey, a.datadate, a.fyear, a.csho, a.at, a.pstkl, a.txditc,\n",
    "                           a.pstkrv, a.seq, a.pstk, a.ppegt, a.invt, a.lt, a.sich, a.ib, a.oancf,\n",
    "                           a.act, a.dlc, a.che, a.lct, a.dvc, a.epspi, a.epspx,\n",
    "                           a.ajex,\n",
    "                           a.sale, a.ao\n",
    "                    from comp.funda as a\n",
    "                    where indfmt='INDL'\n",
    "                    and datafmt='STD'\n",
    "                    and popsrc='D'\n",
    "                    and consol='C'\n",
    "                    and curcd = 'USD'\n",
    "                    and datadate >= '01/01/1965'\n",
    "                    \"\"\", date_cols=['datadate'])\n",
    "\n",
    "# create preferrerd stock\n",
    "comp['ps']=np.where(comp['pstkrv'].isnull(), comp['pstkl'], comp['pstkrv'])\n",
    "comp['ps']=np.where(comp['ps'].isnull(),comp['pstk'], comp['ps'])\n",
    "comp['ps']=np.where(comp['ps'].isnull(),0,comp['ps'])\n",
    "\n",
    "comp['txditc']=comp['txditc'].fillna(0)\n",
    "\n",
    "# create book equity\n",
    "comp['be']=comp['seq']+comp['txditc']-comp['ps']\n",
    "\n",
    "# accrual change in current assets (act) + in debt in current liabilities (dlc)\n",
    "# -  change in cash and short-term investments (che) - change in current liabilities (lct)\n",
    "comp['act']=comp['act'].fillna(0)\n",
    "comp['dlc']=comp['dlc'].fillna(0)\n",
    "comp['che']=comp['che'].fillna(0)\n",
    "comp['lct']=comp['lct'].fillna(0)\n",
    "# comp['year']=comp['datadate'].dt.year\n",
    "comp.sort_values(by=['gvkey', 'datadate'], inplace=True)\n",
    "comp[['act_ch','dlc_ch','che_ch','lct_ch']] = comp.groupby('gvkey')[['act','dlc','che','lct']].diff()\n",
    "comp['acc'] = comp['act_ch'] + comp['dlc_ch'] - comp['che_ch'] - comp['lct_ch']\n",
    "\n",
    "# Average Asset in recent 2 years\n",
    "comp['at_l1'] = comp.groupby('gvkey')['at'].shift(1)\n",
    "comp['at_avg'] = comp[['at','at_l1']].mean(axis=1)\n",
    "# Asset growth\n",
    "comp['ag']=comp.groupby('gvkey')['at'].pct_change(fill_method=None)\n",
    "# change in PPE, unstandardized\n",
    "comp['ppegt_diff'] = comp.groupby('gvkey')['ppegt'].diff()\n",
    "# change in AO, unstandardized\n",
    "comp['ao_diff'] = comp.groupby('gvkey')['ao'].diff()\n",
    "# Sales Growth 1y/3y/5y\n",
    "comp['sale_l1'] = comp.groupby('gvkey')['sale'].shift(1)\n",
    "comp['sale_l3'] = comp.groupby('gvkey')['sale'].shift(3)\n",
    "comp['sale_l5'] = comp.groupby('gvkey')['sale'].shift(5)\n",
    "comp['sg_1y'] = comp['sale']/comp['sale_l1'] - 1\n",
    "comp['sg_3y'] = (comp['sale']/comp['sale_l3'])**(1/3) - 1\n",
    "comp['sg_5y'] = (comp['sale']/comp['sale_l5'])**(1/5) - 1\n",
    "## Net Stock Issuance\n",
    "# adjusted-shares\n",
    "comp['adj_csho'] = comp['csho'] * comp['ajex']\n",
    "comp['adj_csho_l1'] = comp.groupby('gvkey')['adj_csho'].shift(1)\n",
    "comp['nsi'] = np.log(comp['adj_csho']/comp['adj_csho_l1'])\n",
    "\n",
    "comp.to_parquet('../data/WRDS/compa.parquet')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# CCM link"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#######################\n",
    "# CCM Block           #\n",
    "#######################\n",
    "ccm=conn.raw_sql(\"\"\"\n",
    "                  select gvkey, lpermno as permno, linktype, linkprim,\n",
    "                  linkdt, linkenddt\n",
    "                  from crsp.ccmxpf_linktable\n",
    "                  where substr(linktype,1,1)='L'\n",
    "                  and (linkprim ='C' or linkprim='P')\n",
    "                  \"\"\", date_cols=['linkdt', 'linkenddt'])\n",
    "ccm.to_parquet('../data/WRDS/ccm.parquet')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Financial Ratio"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Please Download From:\n",
    "# https://wrds-www.wharton.upenn.edu/pages/get-data/financial-ratios-suite-wrds/\n",
    "# and save to the following path: '../data/WRDS/financial_ratio.dta'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# IBES unadjusted summary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# WRDS path: IBES/IBES Academic/Unadjusted Summary/Summary Statistics\n",
    "# Measures: EPS\n",
    "# FPI: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9\n",
    "# and save to the following path: '../data/WRDS/Forecast_EPS_summary_unadjusted_2023.dta'\n",
    "\n",
    "# We save the data into parquet for faster loading\n",
    "# just run the following code:\n",
    "# df = pd.read_stata('../data/WRDS/Forecast_EPS_summary_unadjusted_2023.dta')\n",
    "# df.to_parquet('../data/WRDS/Forecast_EPS_summary_unadjusted_2023.parquet')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# IBES unadjusted actual"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1. IBES Detail Actual\n",
    "# WRDS path: IBES/IBES Academic/Unadjusted Detail/Actuals\n",
    "# Measures: EPS\n",
    "# Periodicity: ANN/QTR\n",
    "# and save to the following path: '../data/WRDS/Actual_EPS_detail_unadjusted_2023.dta'\n",
    "\n",
    "# 2. IBES Summary Actual\n",
    "# WRDS path: IBES/IBES Academic/Unadjusted Summary/Actuals, Pricing and Ancillary\n",
    "# Measures: EPS\n",
    "# and save to the following path: '../data/WRDS/Actual_EPS_summary_unadjusted_2023.dta'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# CRSP-IBES link table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# WRDS path: Linking Queries by WRDS/IBES CRSP Link (Beta)\n",
    "# and save to the following path: '../data/WRDS/iclink_WRDS.csv'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Other"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Publicly Available Data Provided in the Data Folder:\n",
    "1. **Macro Variables**: RGDP, RCON, INDPROD, UNEMP  \n",
    "   - **File**: `../data/Macro/XXXX.xlsx`  \n",
    "   - **Source**: Federal Reserve Bank of Philadelphia  \n",
    "\n",
    "2. **Factors**: Fama-French, HMXZ, SY, DHS  \n",
    "   - **File**: `../data/Other/ff5_factors_m.csv`  \n",
    "   - **Source**: Authors' websites  \n",
    "\n",
    "3. **FF49 Industry Classification**  \n",
    "   - **File**: `../data/Other/Siccodes49.csv`  \n",
    "   - **Source**: Kenneth R. French's website  \n",
    "  \n",
    "## Publicly Available Data Not Provided:\n",
    "- **Anomalies Data**:  \n",
    "  - **File**: `../data/Other/signed_predictors_dl_wide.csv`  \n",
    "  - **Source**: Open Source Asset Pricing (https://www.openassetpricing.com/)  \n",
    "  - **Version**: 1.3  \n",
    "\n",
    "## Non-Public Data:\n",
    "- **BHL Data**:  \n",
    "  - **File**: `../data/Other/Conditional_Bias.csv`  \n",
    "  - **Access**: Please contact BHL for this data.  "
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
